###############################################################################
## Generating Bureaucracy-Crisis Text Measures
## Schub, "Informing the Leader: Bureaucracies and International Crises", APSR
###############################################################################

## OUTLINE
 ## I.   Load data, dictionaries, dtm, and processed corpus
 ## II.  Format text
 ## III. Generate uncertainty measures
 ## IV.  Generate content measures
 	## Train model 
 	## Apply model 
 ## V.   Validation exercises and manuscript figure 2


##################################
#### Load Libraries
##################################
library(foreign)
library(tm)
library(caret)
library(fastNaiveBayes)


#########################################################
#### I. Load Data and Dictionaries
#########################################################

## set appropriate working directory
setwd("")
## load base data
core<-read.csv("BureaucracyLevel_Raw.csv")
## udict = uncertainty dictionary
load("dictionaries.RData")

#########################################################
#### II. Format Text
#########################################################


## Prepare text
words<-as.character(core$text)

## Format with spaces where needed, remove punctuation, remove most common transcript filler words
text<-gsub(",",", ",words)
text<-gsub(".",". ",text, fixed=TRUE)
text<-gsub("?","? ",text, fixed=TRUE)
# text<-replace_contraction(text) # *alternative pre-processing that addresses contractions; slightly alters the final corpus but produces essentially identical measures with a correlation above 0.999 (requires package "textclean")
text<-gsub("[9][0-9]","",text) 
text<-gsub("([-]|[[:punct:]])"," ", text)
text<-gsub("SPLIT"," ", text) #"split" is used to indicate break between distinct speech segments an individual contributed during a single meeting
text<-gsub("said"," ", text)

## Format as corpus, remove numbers, remove stopwords, and stem remaining words
corp<-Corpus(VectorSource(text))
corp<-tm_map(corp, content_transformer(tolower))
corp<-tm_map(corp, removeNumbers) 
corp<-tm_map(corp, stripWhitespace)
a <- tm_map(corp, removeWords, stopwords("english")) # this stopword file is at C:\Users\[username]\Documents\R\win-library\2.13\tm\stopwords 
a <- tm_map(a, stemDocument, language = "english") 
a <- tm_map(a , stripWhitespace) 

## Create DTM for a raw word count (pre-stopword removal)
dtw<-DocumentTermMatrix(corp, control=list(wordLengths=c(1,Inf))) 


##################################
#### III. Generate Uncertainty Measures
##################################

## Raw word count
core$rawtotwords<-rowSums(as.matrix(dtw)) #raw word count

## Create DTM with processed texts and get a functional word count after pre-processing
adtm<-DocumentTermMatrix(a, control=list(wordLengths=c(1,Inf)))  
core$functotwords.bcracy<-rowSums(as.matrix(adtm)) #functional word count

## Generate main uncertainty measure
uobj<-DocumentTermMatrix(a,list(dictionary=udict))
core$uwords.bcracy<-rowSums(as.matrix(uobj)) #uncertainty dictionary word count per text
core$uper.bcracy<-core$uwords.bcracy/core$functotwords.bcracy
core$uper.bcracy100<-core$uper.bcracy*100

## Generate alternative uncertainty measure
lobj<-DocumentTermMatrix(a,list(dictionary=ldict))
core$lwords.bcracy<-rowSums(as.matrix(lobj)) #alternative uncertainty dictionary word count per text
core$lper.bcracy<-core$lwords.bcracy/core$functotwords.bcracy
core$lper.bcracy100<-core$lper.bcracy*100

## Generate placebo "religous" measure
robj<-DocumentTermMatrix(a,list(dictionary=rdict))
core$rwords.bcracy<-rowSums(as.matrix(robj)) #placebo dictionary word count per text
core$rper.bcracy<-core$rwords.bcracy/core$functotwords.bcracy
core$rper.bcracy100<-core$rper.bcracy*100

# Compare robustness unc dictionary
cor(core$uper.bcracy[core$totwords.bcracy>100],core$lper.bcracy[core$totwords.bcracy>100]) 



##################################
#### IV. Generate Content Measures
##################################

##############################################
### Main measures used throughout manuscript

##############
## Load and clean training data   
trdata<-read.csv("TrainingSet_Raw.csv")
trtext<-as.character(trdata$text)

## Format with spaces where needed and remove punctuation
trtext<-gsub(",",", ",trtext)
trtext<-gsub(".",". ",trtext, fixed=TRUE)
trtext<-gsub("?","? ",trtext, fixed=TRUE)
# trtext<-replace_contraction(trtext) # text<-replace_contraction(text) # *alternative pre-processing that addresses contractions; slightly alters the final training set but produces essentially identical measures with a correlation above 0.999 (requires package "textclean")
trtext<-gsub("([-]|[[:punct:]])"," ", trtext)

## Create and Pre-Process Training Data Corpus
trcorp<-Corpus(VectorSource(trtext))
trcorp<-tm_map(trcorp, content_transformer(tolower))
trcorp<-tm_map(trcorp, removeNumbers)

## Create a document-term matrix and df for training data, stem and remove stop words
trdtm <- DocumentTermMatrix(trcorp, control = list(stemming = T, stopwords = T, minWordLength = 3))

## Remove sparse terms
sparse=removeSparseTerms(trdtm,0.98)

## Make data frame
trdtm <- as.data.frame(as.matrix(sparse))
trdtm <- apply(trdtm,MAR=2,as.integer)

## Remove common transcript filler words
trdf<-as.data.frame(trdtm)
trdftemp<-subset(trdf,select=-c(split, said))
trdtm <-as.matrix(trdftemp)

## Add rownames to dtm
trdtmDocNames<-trdata$files
rownames(trdtm)<-trdtmDocNames
trdtm<-as.data.frame(trdtm)

## Vector with training set hand-coded document classes
coding<-trdata$political
coding2 <- rep(NA,length(coding))
coding2[coding==1] <- "political"
coding2[coding!=1] <- "military"

## Train naive bayes
iv<-trdtm
dv<-coding2
m<-fnb.multinomial(iv,dv,laplace=1)


############################
## Prepare applied data for use  

## Create a document-term matrix and df
dtm <- DocumentTermMatrix(corp, control = list(stemming = T, stopwords = T, minWordLength = 3)) 
dtm <- as.data.frame(as.matrix(dtm))
dtm <- apply(dtm,MAR=2,as.integer)
df<-as.data.frame(dtm)
dtm<-as.matrix(df)

## add rownames to the dtm
dtmDocNames <- c(as.character(core$id))
rownames(dtm) <- dtmDocNames
dtmapplied<-dtm

## subset applied set to terms in training set
trterms<-colnames(trdtm)
dtmsub<-dtm[,colnames(dtm) %in% c(trterms)]

## applied text score: continuous
preda<-predict(m,dtmsub,type="rawprob")
preda<-as.data.frame(preda)
predaratio<-preda$political/preda$military
predaflip<-(1-predaratio)+1
core$fnbflip<-predaflip
fnbflip50<-core$fnbflip[core$functotwords.bcracy>=50] #standardized scores using only texts with at least 50 words
core$fnbstd<-ifelse(core$functotwords.bcracy>=50,(core$fnbflip-min(fnbflip50))/(max(fnbflip50)-min(fnbflip50)),NA) #gen scores for those with over 50 words

## appied text score: binary
predabi<-NA
predabi[predaflip>1 & !is.na(predaflip)]<-"political"
predabi[predaflip<1 & !is.na(predaflip)]<-"military"
core$fnbclass<-predabi
core$fnbpolitical<-ifelse(core$fnbclass=="political",1,0)
core$fnbpolitical[is.na(core$fnbclass)]<-NA


## generate codings for "expert" type when ideal type bureaucracy discusses its core content
core$competent2<-0
core$competent2[core$state==1 & core$fnbpolitical==1]<-1    
core$competent2[(core$role=="defense"|core$role=="jcs") & core$fnbpolitical ==0]<-1 
core$competent2[(core$role=="cia"|core$role=="executive")]<-NA

## subset to observations with more than 100 words and drop the president
corekeep<-core[core$totwords.bcracy>100 & core$role!="president",]

####################
## Outputs the data that is used for bureaucracy level analysis in "Schub_MainResults_APSR.R"
# write.csv(corekeep,"BureaucracyLevel_Polished.csv")
####################


##################################
#### V. Validation Exercises
##################################

########
## Train naive bayes

## Prep training data
trdtm$y<-coding2
iv<-trdtm[,1:634]
dv<-trdtm[,635]

## Use raw scores to classify texts
set.seed(0796)
m<-fnb.multinomial(iv,dv,laplace=1)
predfr<-predict(m,iv,type="rawprob")
predfr<-as.data.frame(predfr)
predfrcl<-ifelse(predfr$political/predfr$military<1,"political","military") #place each text into one of two categories

########
## 10-fold cross-validation to generate performance metrics
set.seed(2)
flds<-createFolds(dv, k=10)
det<-as.data.frame(matrix(NA,nrow=1,ncol=5))
colnames(det)<-c("military","political","coding2","predcl","fold")
for(i in 1:10){
	train<-trdtm[-flds[[i]],]
	trainiv<-train[,1:634]
	traindv<-train[,635]
	test<-trdtm[flds[[i]],]
	testiv<-test[,1:634]
	testdv<-test[,635]
	mf<-fnb.multinomial(trainiv,traindv,laplace=1)
	mfpred<-predict(mf,testiv,type="rawprob")
	mfpred<-as.data.frame(mfpred)
	mfpred$coding2<-testdv
	mfpred$predcl<-ifelse(mfpred$political/mfpred$military<1,"political","military")
	mfpred$fold<-i
	det<-rbind(det,mfpred)
	print(i)
	}
det2<-det[-1,] #ditch the holder first row
det2$correct<-ifelse(det2$coding2==det2$predcl,1,0) #see if prediction is correct
det2$score<-det2$military/det2$political

########
## Performance Evaluation
acc<-mean(det2$correct)
truemil<-nrow(det2[det2$coding2=="military" & det2$correct==1,])
falsemil<-nrow(det2[det2$predcl=="military" & det2$correct==0,])
falsepol<-nrow(det2[det2$predcl=="political" & det2$correct==0,])
precmil<-truemil/(truemil+falsemil)
recmil<-truemil/(truemil+falsepol)
fmil<-(2*precmil*recmil)/(precmil+recmil)

acc  #0.88 accuracy
precmil #0.95 precision
recmil #0.85 recall
fmil #0.90 F-score

########
## Figure A1 in SI
det2$handpolitical<-ifelse(det2$coding2=="political",1,0)

plot(det2$score[det2$correct==1],jitter(det2$handpolitical[det2$correct==1],factor=0.2),pch=19,col="dodgerblue4",xlab="Naive Bayes Raw Score",ylab="Hand Coding",yaxt="n",main="Cross-Validation Performance")
points(det2$score[det2$correct==0],jitter(det2$handpolitical[det2$correct==0],factor=0.2),col="firebrick")
axis(2,at=c(0,1),las=3,labels=c("miltiary","political"))
abline(v=1,lty=3)


########
## Figure 2 in Manuscript

## Get word discrimination
table(dv)
wprobpol<-m$present[2,]/table(dv)[2] #term count in all political texts/count of political texts
wprobmil<-m$present[1,]/table(dv)[1] #term count in all military texts/count of military texts
diffs<-(wprobmil-wprobpol)*-1 #differences between classes for each term
diffsord<-sort(diffs)
milwords<-diffsord[1:25] #distinguishing military words
polwords<-(diffsord[(length(diffsord)-24):length(diffsord)]) #distinguishing political words

## Put military terms in a usable format for plotting
mdf<-as.data.frame(rbind(m$present,diffs))
milwordsdf<-as.data.frame(milwords)
mildf<-mdf[,colnames(mdf) %in% rownames(milwordsdf)]
mildf2<-mildf[,order(mildf[3,])]
mildf3<-as.data.frame(t(mildf2))
mildf3$tot<-(mildf3$military+mildf3$political)

## Put political terms in a usable format for plotting
pdf<-as.data.frame(rbind(m$present,diffs))
polwordsdf<-as.data.frame(polwords)
poldf<-pdf[,colnames(pdf) %in% rownames(polwordsdf)]
poldf2<-poldf[,order(poldf[3,])]
poldf3<-as.data.frame(t(poldf2))
poldf3$tot<-(poldf3$military+poldf3$political)

## Put into a useful presentational scale
allwords<-sum(m$present)
scalar<-0.3
reduce<-3
fiv<-((1/500*allwords)^(scalar))/reduce
twofive<-((1/250*allwords)^(scalar))/reduce
one<-((1/100*allwords)^(scalar))/reduce

## Manuscript Figure 2
#pdf(file="discriminatingwords_CA.pdf",height=8,width=8)
par(mfrow=c(1,1))
par(mar=c(2,1,1,1))
par(oma=c(2,1,1,1))
plot(rev(milwords),(1:length(milwords)),col="white",xlim=c(-0.8,0.4),ylim=c(1,26),ylab="",yaxt="n",xlab="",xaxt="n",bty="n")
#text(milord$diffs,(1:nrow(milord)),rownames(milord),cex=c(1.3))
#text(polord$diffs,(1:nrow(polord)),rownames(polord),cex=c(1.3))
text(rev(milwords),(1:length(milwords)),names(rev(milwords)),cex=c((rev(mildf3$tot^(scalar))/reduce)))
text(polwords,(1:length(polwords)),names(polwords),cex=c(poldf3$tot^(scalar))/reduce)
abline(v=0,lty=3)
axis(1,at=c(-0.64,0.34),labels=c("military","political"),cex.axis=1.4,lwd.ticks=0)
arrows(0.00,0,0.36,0, xpd=T)
arrows(-0.000,0,-0.67,0, xpd=T)
text(-0.42,6.9,"word frequency")
segments(-0.58,6.4,-0.26,6.4)
text(-0.50,5.5,"a",cex=one)
text(-0.50,4.25,"a",cex=twofive)
text(-0.50,3,"a",cex=fiv)
text(-0.42,5.5,"=1/100",cex=1)
text(-0.42,4.25,"=1/250",cex=1)
text(-0.42,3,"=1/500",cex=1)
#dev.off()



########
## Alternative algorithms (referenced in manuscript and SI S3.2)

## Random Forest 
set.seed(07960)
fitcontrol<-trainControl(method="repeatedcv", number=10, verboseIter=T, search="random", repeats=2) 
  #### Line below is slow to run (can alternatively load the results directly)
## fit<-train(y ~ ., data=trdtm, method="rf", trControl=fitcontrol, tuneLength=5)

  ##save(fit,file="rf_fitted.RData")
load("rf_fitted.RData") ## directly load the output rather than running the model above
fit$results #83% maximum accuracy


## SVM
set.seed(07960)
fitcontrolsvm<-trainControl(method="repeatedcv", number=10, verboseIter=TRUE, classProbs=TRUE, repeats=3) 
gridsvm<-expand.grid(C=seq(0.5,2,length=4))

  #### Line below is slow to run (can alternatively load the results directly)
## fitsvm<-train(y ~ ., data=trdtm, method="svmLinear", trControl=fitcontrolsvm, tuneGrid=gridsvm)

  ##save(fitsvm,file="svm_fitted.RData")
load("svm_fitted.RData") ## directly load the output rather than running the model above
fitsvm$results #83% max accuracy







 
 
 